# Packages required by this analysis.
packages.used <- c("ggplot2", "dplyr", "tibble", "tidyr", "stringr", "tidytext",
                   "topicmodels", "wordcloud2", "wordcloud", "ggridges", "corrplot")

# Install any packages that are not yet installed.
# installed.packages() uses the package name as row name, so setdiff against
# rownames() directly instead of setdiff(x, intersect(installed, x)).
# Use https for the CRAN mirror (the original used plain http).
packages.needed <- setdiff(packages.used, rownames(installed.packages()))
if (length(packages.needed) > 0) {
  install.packages(packages.needed, dependencies = TRUE,
                   repos = 'https://cran.us.r-project.org')
}

library(ggplot2)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(tidytext)
library(topicmodels)
library(wordcloud2)
library(wordcloud)
library(ggridges)
library(corrplot)
# Load the multiplot() helper used later to arrange several ggplots on one page.
source("../libs/multiplot.R")
# Read the spooky-author dataset (columns: id, text, author); as.is = TRUE
# keeps the text columns as character instead of converting them to factors.
spooky <- read.csv('../data/spooky.csv', as.is = TRUE)
head(spooky)
summary(spooky)
## id text author
## Length:19579 Length:19579 Length:19579
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
# View(spooky)
# Sanity check: every sentence id is unique.
length(unique(spooky$id)) == nrow(spooky)
## [1] TRUE
# Sanity check: every sentence text is unique (no duplicated rows).
length(unique(spooky$text)) == nrow(spooky)
## [1] TRUE
# Sanity check: no missing values anywhere in the data frame.
sum(is.na(spooky))
## [1] 0
# Use the unnest_tokens() function to drop all punctuation and transform all
# words into lower case, producing a table with one word per row; then remove
# `stop words` (i.e. the common words) with an anti-join against tidytext's
# stop_words lexicon.
# (This explanatory sentence was previously left uncommented, which made the
# script fail to parse.)
spooky_wrd <- unnest_tokens(spooky, word, text)
spooky_wrd <- anti_join(spooky_wrd, stop_words, by = "word")
# The word-frequency table for each author was built three times with the same
# copy-pasted pipeline, which also spread the full author x word matrix only to
# drop the NA rows again. Filtering to one author first is simpler and cheaper.
#
# Returns a two-column data frame (word, <author_id>) sorted by descending
# frequency — the same column names the original spread-based code produced,
# and the shape wordcloud2() expects (word, freq).
author_word_freqs <- function(word_df, author_id) {
  freqs <- word_df %>%
    dplyr::filter(author == author_id) %>%
    dplyr::count(word, sort = TRUE)
  names(freqs)[2] <- author_id
  freqs
}

# Edgar Allan Poe word cloud, shaped by the octopus example image.
spooky_wrd_EAP <- author_word_freqs(spooky_wrd, "EAP")
figPath <- system.file("examples/octopus.jpg", package = "wordcloud2")
# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/Wordcloud_EAP")
wordcloud2(spooky_wrd_EAP, figPath = figPath, size = 0.5, color = "random-light", backgroundColor = "black")

# Mary Shelley word cloud, shaped by the ghost example image.
spooky_wrd_MWS <- author_word_freqs(spooky_wrd, "MWS")
figPath <- system.file("examples/white-ghost-hi.jpg", package = "wordcloud2")
# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/Wordcloud_MWS")
wordcloud2(spooky_wrd_MWS, figPath = figPath, size = 0.5, color = "random-light", backgroundColor = "black")

# HP Lovecraft word cloud, shaped by the bird example image.
spooky_wrd_HPL <- author_word_freqs(spooky_wrd, "HPL")
figPath <- system.file("examples/black-bird-hi.png", package = "wordcloud2")
# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/Wordcloud_HPL")
wordcloud2(spooky_wrd_HPL, figPath = figPath, size = 0.5, color = "random-light", backgroundColor = "black")
# Plot 1: how many sentences each author contributes.
plot_author_counts <- ggplot(spooky) +
  geom_bar(aes(author, fill = author)) +
  theme(legend.position = "none")

# Sentence length in characters, one value per sentence.
spooky$sen_length <- str_length(spooky$text)
head(spooky$sen_length)
## [1] 231 71 200 206 174 468

# Plot 2: sentence-length distributions per author, on a log scale.
plot_sentence_lengths <- ggplot(spooky) +
  geom_density_ridges(aes(sen_length, author, fill = author)) +
  scale_x_log10() +
  labs(x = "Sentence length [# characters]") +
  theme(legend.position = "none")

# Word length in characters, one value per token.
spooky_wrd$word_length <- str_length(spooky_wrd$word)
head(spooky_wrd$word_length)
## [1] 7 8 5 12 10 7

# Plot 3: word-length densities per author, overlaid with transparency.
plot_word_lengths <- ggplot(spooky_wrd) +
  geom_density(aes(word_length, fill = author), bw = 0.05, alpha = 0.3) +
  scale_x_log10() +
  labs(x = "Word length [# characters]") +
  theme(legend.position = "none")

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/data_visualization")
# Layout: plot 1 spans the left column; plots 2 and 3 stack on the right.
layout <- matrix(c(1, 2, 1, 3), 2, 2, byrow = TRUE)
multiplot(plot_author_counts, plot_sentence_lengths, plot_word_lengths, layout = layout)
## Loading required package: grid
## Picking joint bandwidth of 0.0414
# Tag each tokenized word with its NRC emotion/sentiment categories.
# inner_join keeps only words present in the lexicon; a word may match
# several categories and therefore appear in several rows.
get_sentiments('nrc')
sentiments <- spooky_wrd %>%
  inner_join(get_sentiments('nrc'), by = "word")
sentiments %>% count(sentiment)
sentiments %>% count(author, sentiment)

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/NRC_all_bar")
# Overall NRC category counts across all three authors.
sentiments %>%
  count(sentiment) %>%
  ggplot() +
  geom_col(aes(sentiment, n, fill = sentiment))

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/NRC_author")
# The same counts broken out per author, one panel each.
sentiments %>%
  count(author, sentiment) %>%
  ggplot() +
  geom_col(aes(sentiment, n, fill = sentiment)) +
  facet_wrap(~ author) +
  coord_flip() +
  theme(legend.position = "none")
# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/NRC_HPL")
# Positive/negative NRC word counts for HP Lovecraft, most frequent first.
sentiments_HPL <- sentiments %>%
  filter(author == "HPL", sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Plot the ten words contributing most to each of the two sentiments.
sentiments_HPL %>%
  group_by(sentiment) %>%
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative/positive sentiment", x = NULL) +
  coord_flip() +
  ggtitle("HP Lovecraft - Sentiment analysis")
# Repeat the sentiment tagging with the bing lexicon, which classifies each
# word simply as positive or negative.
get_sentiments('bing')
sentiments_bing <- spooky_wrd %>%
  inner_join(get_sentiments('bing'), by = "word")
sentiments_bing %>% count(sentiment)
sentiments_bing %>% count(author, sentiment)

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/bing_all_bar")
# Overall positive vs negative counts across all three authors.
sentiments_bing %>%
  count(sentiment) %>%
  ggplot() +
  geom_col(aes(sentiment, n, fill = sentiment))

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/bing_author")
# The same counts broken out per author, one panel each.
sentiments_bing %>%
  count(author, sentiment) %>%
  ggplot() +
  geom_col(aes(sentiment, n, fill = sentiment)) +
  facet_wrap(~ author) +
  coord_flip() +
  theme(legend.position = "none")
# The top-words-per-sentiment chart was built twice with identical copy-pasted
# pipelines (once for HPL, once for MWS); factor the shared logic into helpers.

# Count each word's bing sentiment occurrences for one author, most frequent
# first.
bing_word_counts <- function(sent_df, author_id) {
  sent_df %>%
    dplyr::filter(author == author_id) %>%
    dplyr::count(word, sentiment, sort = TRUE) %>%
    dplyr::ungroup()
}

# Bar chart of the ten words contributing most to each sentiment.
plot_bing_top_words <- function(word_counts, plot_title) {
  word_counts %>%
    group_by(sentiment) %>%
    top_n(10, n) %>%
    ungroup() %>%
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free_y") +
    labs(y = "Contribution to negative/positive sentiment", x = NULL) +
    coord_flip() +
    ggtitle(plot_title)
}

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/bing_HPL")
sentiments_bing_HPL <- bing_word_counts(sentiments_bing, "HPL")
plot_bing_top_words(sentiments_bing_HPL, "HP Lovecraft - Sentiment analysis")

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/bing_MWS")
sentiments_bing_MWS <- bing_word_counts(sentiments_bing, "MWS")
plot_bing_top_words(sentiments_bing_MWS, "Mary Shelley - Sentiment analysis")
# Restrict the bing lexicon to its negative words only.
bing_neg <- filter(get_sentiments('bing'), sentiment == "negative")
bing_neg
# Keep only the tokens that are bing-negative words.
negative <- inner_join(spooky_wrd, bing_neg, by = "word")
head(negative)
count(negative, word, sort = TRUE)

# Per-author and overall counts of each negative word. Use an explicit join
# suffix so the columns are self-describing (the original produced the opaque
# default names n.x / n.y).
neg_words <- count(group_by(negative, word, author))
neg_words_all <- count(group_by(negative, word))
neg_words <- left_join(neg_words, neg_words_all, by = "word",
                       suffix = c("_author", "_total"))
neg_words <- arrange(neg_words, desc(n_total))
# Keep roughly the top 27 words: 27 words x 3 authors = 81 rows.
neg_words <- ungroup(head(neg_words, 81))

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/Freq_negative")
# Bars ordered by total frequency across authors; one panel per author shows
# that author's own count of each word.
ggplot(neg_words) +
  geom_col(aes(reorder(word, n_total, FUN = min), n_author, fill = author)) +
  xlab(NULL) +
  coord_flip() +
  facet_wrap(~ author) +
  theme(legend.position = "none")
# spooky_wrd_tm is a sparse document-term matrix with one row per sentence id
# in spooky_wrd and one column per unique word; the value at (row, column) is
# how many times that word occurs in that sentence. Since most sentence/word
# pairings don't occur, the matrix is sparse (mostly zeros).
# (This explanation was previously left uncommented, fused onto the LDA call,
# which made the script fail to parse.)
#
# NOTE(review): the line that actually built spooky_wrd_tm was missing from
# this script — only the prose describing it survived. Reconstructed here via
# tidytext::cast_dtm() from per-sentence word counts; confirm the row/column
# counts match the 19467 x 24941 the prose reports.
sent_wrd_freqs <- dplyr::count(spooky_wrd, id, word)
spooky_wrd_tm <- tidytext::cast_dtm(sent_wrd_freqs, id, word, n)

# Fit a 6-topic LDA model; the fixed seed makes the fit reproducible.
spooky_wrd_lda <- LDA(spooky_wrd_tm, k = 6, control = list(seed = 1234))

# Use the tidy function to extract the per-topic-per-word probabilities,
# called "beta", from the model. The output has one term per topic per row,
# giving the probability of that term being generated from that topic.
spooky_wrd_topics <- tidy(spooky_wrd_lda, matrix = "beta")
spooky_wrd_topics
spooky_wrd_lda
## A LDA_VEM topic model with 6 topics.

# Grab the top five words for each topic.
spooky_wrd_topics_5 <- ungroup(top_n(group_by(spooky_wrd_topics, topic), 5, beta))
spooky_wrd_topics_5 <- arrange(spooky_wrd_topics_5, topic, -beta)
spooky_wrd_topics_5 <- mutate(spooky_wrd_topics_5, term = reorder(term, beta))

# jpeg("/Users/yunli/Documents/GitHub/spring2018-project1-YUNLI531/figs/topic_modeling")
ggplot(spooky_wrd_topics_5) +
  geom_col(aes(term, beta, fill = factor(topic)), show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 3) +
  coord_flip()